package com.linghang.wusthelper.spider.library;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Crawler for the content of a library announcement page.
 *
 * <p>For every processed page it selects the announcement table, appends the
 * selected markup to the {@link StringBuilder} supplied by the caller, and then
 * calls {@code notify()} on that builder so that a thread waiting on the
 * builder's monitor can continue.
 *
 * @author origin
 */
public class LibAnnContentSpider implements PageProcessor {

    /** XPath of the table that holds the announcement body. */
    private static final String CONTENT_XPATH = "//*[@id=\"content\"]/tbody/tr/td[2]/table";

    /** Output buffer owned by the caller; also used as the monitor for the hand-off. */
    private final StringBuilder stringBuilder;

    // NOTE(review): the domain value below includes the "http://" scheme; confirm
    // whether a bare host name is expected here before changing it.
    private final Site site = Site.me().setTimeOut(3000).setRetryTimes(3)
            .setDomain("http://www.lib.wust.edu.cn")
            .addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
            .addHeader("Accept-Encoding","gzip,deflate,sdch")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.8")
            .addHeader("Connection", "keep-alive")
            .addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");


    /**
     * Creates a spider that writes the extracted content into the given builder.
     *
     * @param stringBuilder buffer that receives the extracted markup and whose
     *                      monitor is notified after each processed page
     * @throws NullPointerException if {@code stringBuilder} is {@code null}
     */
    public LibAnnContentSpider(StringBuilder stringBuilder){
        // Fail at construction time rather than later inside process().
        if (stringBuilder == null) {
            throw new NullPointerException("stringBuilder");
        }
        this.stringBuilder = stringBuilder;
    }

    /**
     * Extracts the announcement table from the page and hands it to the caller.
     *
     * <p>If the XPath matches nothing, the builder is left untouched instead of
     * receiving the literal text {@code "null"}; the notification is still sent.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        String content = page.getHtml().xpath(CONTENT_XPATH).get();
        synchronized (stringBuilder){
            // append((String) null) would write the four characters "null".
            if (content != null) {
                stringBuilder.append(content);
            }
            // Notify even when nothing matched so a waiting thread is not left blocked.
            stringBuilder.notify();
        }

    }

    /**
     * Returns the site configuration (timeout, retries, domain and request headers)
     * used by this spider.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        return site;
    }
}
